package org.apache.lucene.index;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
import java.io.IOException;
import java.util.Arrays;
import java.util.Random;

import org.apache.lucene.analysis.*;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.TermVectorsReader;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.StoredField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TestUtil;

/** Tests the codec-level {@link TermVectorsReader} against a small hand-built index. */
public class TestTermVectorsReader extends LuceneTestCase {

  private String[] testFields = {"f1", "f2", "f3", "f4"};
  private boolean[] testFieldsStorePos = {true, false, true, false};
  private boolean[] testFieldsStoreOff = {true, false, false, true};
  private String[] testTerms = {"this", "is", "a", "test"};
  private int[][] positions = new int[testTerms.length][];
  private Directory dir;
  private SegmentCommitInfo seg;
  private FieldInfos fieldInfos = new FieldInfos(new FieldInfo[0]);
  private static final int TERM_FREQ = 3;

  private class TestToken implements Comparable<TestToken> {
    String text;
    int pos;
    int startOffset;
    int endOffset;

    @Override
    public int compareTo(TestToken other) {
      // Avoids the int-subtraction idiom, which can overflow for extreme values.
      return Integer.compare(pos, other.pos);
    }
  }

  TestToken[] tokens = new TestToken[testTerms.length * TERM_FREQ];

  @Override
  public void setUp() throws Exception {
    super.setUp();

    Arrays.sort(testTerms);
    int tokenUpto = 0;
    Random rnd = random();
    for (int i = 0; i < testTerms.length; i++) {
      positions[i] = new int[TERM_FREQ];
      for (int j = 0; j < TERM_FREQ; j++) {
        // Positions are random but strictly increasing per term: occurrence j
        // lands somewhere in the window [j*10, j*10 + 10).
        positions[i][j] = (int) (j * 10 + rnd.nextDouble() * 10);
        TestToken token = tokens[tokenUpto++] = new TestToken();
        token.text = testTerms[i];
        token.pos = positions[i][j];
        token.startOffset = j * 10;
        token.endOffset = j * 10 + testTerms[i].length();
      }
    }
    Arrays.sort(tokens);

    dir = newDirectory();
    IndexWriter writer = new IndexWriter(
        dir,
        newIndexWriterConfig(new MyAnalyzer())
            .setMaxBufferedDocs(-1)
            .setMergePolicy(newLogMergePolicy(false, 10))
            .setUseCompoundFile(false)
    );

    Document doc = new Document();
    for (int i = 0; i < testFields.length; i++) {
      FieldType customType = new FieldType(TextField.TYPE_NOT_STORED);
      // Every field stores term vectors; positions and offsets are enabled
      // per the testFieldsStorePos/testFieldsStoreOff flag arrays.
      customType.setStoreTermVectors(true);
      if (testFieldsStorePos[i]) {
        customType.setStoreTermVectorPositions(true);
      }
      if (testFieldsStoreOff[i]) {
        customType.setStoreTermVectorOffsets(true);
      }
      doc.add(new Field(testFields[i], "", customType));
    }

    // Index the same synthetic document several times so the segment contains
    // multiple documents with identical term vectors.
    for (int j = 0; j < 5; j++) {
      writer.addDocument(doc);
    }
    writer.commit();
    seg = writer.newestSegment();
    writer.close();

    fieldInfos = IndexWriter.readFieldInfos(seg);
  }

  @Override
  public void tearDown() throws Exception {
    dir.close();
    super.tearDown();
  }

  /** Emits the pre-built, position-sorted {@link TestToken}s; the reader input is ignored. */
  private class MyTokenizer extends Tokenizer {
    private int tokenUpto;

    private final CharTermAttribute termAtt;
    private final PositionIncrementAttribute posIncrAtt;
    private final OffsetAttribute offsetAtt;

    public MyTokenizer() {
      super();
      termAtt = addAttribute(CharTermAttribute.class);
      posIncrAtt = addAttribute(PositionIncrementAttribute.class);
      offsetAtt = addAttribute(OffsetAttribute.class);
    }

    @Override
    public boolean incrementToken() {
      if (tokenUpto >= tokens.length) {
        return false;
      } else {
        final TestToken testToken = tokens[tokenUpto++];
        clearAttributes();
        termAtt.append(testToken.text);
        offsetAtt.setOffset(testToken.startOffset, testToken.endOffset);
        if (tokenUpto > 1) {
          // Encode the absolute position as a delta from the previous token.
          posIncrAtt.setPositionIncrement(testToken.pos - tokens[tokenUpto - 2].pos);
        } else {
          // First token: the increment is pos + 1 because positions start at -1.
          posIncrAtt.setPositionIncrement(testToken.pos + 1);
        }
        return true;
      }
    }

    @Override
    public void reset() throws IOException {
      super.reset();
      this.tokenUpto = 0;
    }
  }

  private class MyAnalyzer extends Analyzer {
    @Override
    public TokenStreamComponents createComponents(String fieldName) {
      return new TokenStreamComponents(new MyTokenizer());
    }
  }
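
  // Not part of the original assertions: a minimal sanity sketch that consumes
  // MyTokenizer directly through the standard TokenStream contract
  // (setReader / reset / incrementToken / end / close) and checks that the
  // encoded position increments reconstruct the absolute TestToken positions.
  public void testMyTokenizerSanity() throws IOException {
    MyTokenizer ts = new MyTokenizer();
    CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posIncrAtt = ts.getAttribute(PositionIncrementAttribute.class);
    ts.setReader(new java.io.StringReader("")); // never read, but required by the Tokenizer contract
    ts.reset();
    int pos = -1; // positions start at -1; the first increment is pos + 1
    int count = 0;
    while (ts.incrementToken()) {
      pos += posIncrAtt.getPositionIncrement();
      assertEquals(tokens[count].pos, pos);
      assertEquals(tokens[count].text, termAtt.toString());
      count++;
    }
    ts.end();
    ts.close();
    assertEquals(tokens.length, count);
  }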

  public void test() throws IOException {
    DirectoryReader reader = DirectoryReader.open(dir);
    // Every leaf of the index must report that it stores term vectors.
    for (LeafReaderContext ctx : reader.leaves()) {
      SegmentReader sr = (SegmentReader) ctx.reader();
      assertTrue(sr.getFieldInfos().hasVectors());
    }
    reader.close();
  }
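
  // Not part of the original test: an illustrative sketch showing that the same
  // vectors are reachable through the high-level IndexReader API, without
  // instantiating the codec's TermVectorsReader directly.
  public void testGetTermVectorViaIndexReader() throws IOException {
    DirectoryReader reader = DirectoryReader.open(dir);
    Terms vector = reader.getTermVector(0, testFields[0]);
    assertNotNull(vector);
    assertEquals(testTerms.length, vector.size());
    reader.close();
  }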

  public void testReader() throws IOException {
    TermVectorsReader reader = Codec.getDefault().termVectorsFormat().vectorsReader(dir, seg.info, fieldInfos, newIOContext(random()));
    for (int j = 0; j < 5; j++) {
      Terms vector = reader.get(j).terms(testFields[0]);
      assertNotNull(vector);
      assertEquals(testTerms.length, vector.size());
      TermsEnum termsEnum = vector.iterator();
      for (int i = 0; i < testTerms.length; i++) {
        final BytesRef text = termsEnum.next();
        assertNotNull(text);
        String term = text.utf8ToString();
        assertEquals(testTerms[i], term);
      }
      assertNull(termsEnum.next());
    }
    reader.close();
  }

  public void testDocsEnum() throws IOException {
    TermVectorsReader reader = Codec.getDefault().termVectorsFormat().vectorsReader(dir, seg.info, fieldInfos, newIOContext(random()));
    for (int j = 0; j < 5; j++) {
      Terms vector = reader.get(j).terms(testFields[0]);
      assertNotNull(vector);
      assertEquals(testTerms.length, vector.size());
      TermsEnum termsEnum = vector.iterator();
      PostingsEnum postingsEnum = null;
      for (int i = 0; i < testTerms.length; i++) {
        final BytesRef text = termsEnum.next();
        assertNotNull(text);
        String term = text.utf8ToString();
        assertEquals(testTerms[i], term);

        postingsEnum = TestUtil.docs(random(), termsEnum, postingsEnum, PostingsEnum.NONE);
        assertNotNull(postingsEnum);
        // A term vector behaves like a one-document inverted index: the enum
        // starts unpositioned (-1), yields exactly one doc, then is exhausted.
        assertEquals(-1, postingsEnum.docID());
        assertTrue(postingsEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
        assertEquals(DocIdSetIterator.NO_MORE_DOCS, postingsEnum.nextDoc());
      }
      assertNull(termsEnum.next());
    }
    reader.close();
  }
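
  // Not part of the original test: a sketch showing per-term frequencies inside
  // the single term-vector "document". Each test term was injected TERM_FREQ
  // times per field in setUp(), so freq() should report TERM_FREQ.
  public void testDocsEnumFreqs() throws IOException {
    TermVectorsReader reader = Codec.getDefault().termVectorsFormat().vectorsReader(dir, seg.info, fieldInfos, newIOContext(random()));
    Terms vector = reader.get(0).terms(testFields[0]);
    assertNotNull(vector);
    TermsEnum termsEnum = vector.iterator();
    PostingsEnum postings = null;
    while (termsEnum.next() != null) {
      postings = termsEnum.postings(postings, PostingsEnum.FREQS);
      assertTrue(postings.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
      assertEquals(TERM_FREQ, postings.freq());
    }
    reader.close();
  }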

  public void testPositionReader() throws IOException {
    TermVectorsReader reader = Codec.getDefault().termVectorsFormat().vectorsReader(dir, seg.info, fieldInfos, newIOContext(random()));
    Terms vector = reader.get(0).terms(testFields[0]);
    assertNotNull(vector);
    assertEquals(testTerms.length, vector.size());
    TermsEnum termsEnum = vector.iterator();
    PostingsEnum dpEnum = null;
    for (int i = 0; i < testTerms.length; i++) {
      final BytesRef text = termsEnum.next();
      assertNotNull(text);
      String term = text.utf8ToString();
      assertEquals(testTerms[i], term);

      // First pass: positions only.
      dpEnum = termsEnum.postings(dpEnum, PostingsEnum.ALL);
      assertNotNull(dpEnum);
      assertEquals(-1, dpEnum.docID());
      assertTrue(dpEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
      assertEquals(positions[i].length, dpEnum.freq());
      for (int j = 0; j < positions[i].length; j++) {
        assertEquals(positions[i][j], dpEnum.nextPosition());
      }
      assertEquals(DocIdSetIterator.NO_MORE_DOCS, dpEnum.nextDoc());

      // Second pass: positions plus offsets.
      dpEnum = termsEnum.postings(dpEnum, PostingsEnum.ALL);
      assertNotNull(dpEnum);
      assertEquals(-1, dpEnum.docID());
      assertTrue(dpEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
      assertEquals(positions[i].length, dpEnum.freq());
      for (int j = 0; j < positions[i].length; j++) {
        assertEquals(positions[i][j], dpEnum.nextPosition());
        assertEquals(j * 10, dpEnum.startOffset());
        assertEquals(j * 10 + testTerms[i].length(), dpEnum.endOffset());
      }
      assertEquals(DocIdSetIterator.NO_MORE_DOCS, dpEnum.nextDoc());
    }

    // f2 stores neither positions nor offsets, but a PostingsEnum is still available.
    Terms freqVector = reader.get(0).terms(testFields[1]);
    assertNotNull(freqVector);
    assertEquals(testTerms.length, freqVector.size());
    termsEnum = freqVector.iterator();
    assertNotNull(termsEnum);
    for (int i = 0; i < testTerms.length; i++) {
      final BytesRef text = termsEnum.next();
      assertNotNull(text);
      String term = text.utf8ToString();
      assertEquals(testTerms[i], term);
      assertNotNull(termsEnum.postings(null));
      assertNotNull(termsEnum.postings(null, PostingsEnum.ALL));
    }
    reader.close();
  }

  public void testOffsetReader() throws IOException {
    TermVectorsReader reader = Codec.getDefault().termVectorsFormat().vectorsReader(dir, seg.info, fieldInfos, newIOContext(random()));
    Terms vector = reader.get(0).terms(testFields[0]);
    assertNotNull(vector);
    TermsEnum termsEnum = vector.iterator();
    assertNotNull(termsEnum);
    assertEquals(testTerms.length, vector.size());
    PostingsEnum dpEnum = null;
    for (int i = 0; i < testTerms.length; i++) {
      final BytesRef text = termsEnum.next();
      assertNotNull(text);
      String term = text.utf8ToString();
      assertEquals(testTerms[i], term);

      dpEnum = termsEnum.postings(dpEnum, PostingsEnum.ALL);
      assertNotNull(dpEnum);
      assertTrue(dpEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
      assertEquals(positions[i].length, dpEnum.freq());
      for (int j = 0; j < positions[i].length; j++) {
        assertEquals(positions[i][j], dpEnum.nextPosition());
      }
      assertEquals(DocIdSetIterator.NO_MORE_DOCS, dpEnum.nextDoc());

      dpEnum = termsEnum.postings(dpEnum, PostingsEnum.ALL);
      assertNotNull(dpEnum);
      assertTrue(dpEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
      assertEquals(positions[i].length, dpEnum.freq());
      for (int j = 0; j < positions[i].length; j++) {
        assertEquals(positions[i][j], dpEnum.nextPosition());
        assertEquals(j * 10, dpEnum.startOffset());
        assertEquals(j * 10 + testTerms[i].length(), dpEnum.endOffset());
      }
      assertEquals(DocIdSetIterator.NO_MORE_DOCS, dpEnum.nextDoc());
    }
    reader.close();
  }

  public void testIllegalIndexableField() throws Exception {
    Directory dir = newDirectory();
    MockAnalyzer a = new MockAnalyzer(random());
    a.setEnableChecks(false);
    RandomIndexWriter w = new RandomIndexWriter(random(), dir, a);

    // Payloads require positions.
    FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
    ft.setStoreTermVectors(true);
    ft.setStoreTermVectorPayloads(true);
    Document doc = new Document();
    doc.add(new Field("field", "value", ft));
    try {
      w.addDocument(doc);
      fail("did not hit exception");
    } catch (IllegalArgumentException iae) {
      // Expected
      assertEquals("cannot index term vector payloads without term vector positions (field=\"field\")", iae.getMessage());
    }

    // Offsets require term vectors to be enabled.
    ft = new FieldType(TextField.TYPE_NOT_STORED);
    ft.setStoreTermVectors(false);
    ft.setStoreTermVectorOffsets(true);
    doc = new Document();
    doc.add(new Field("field", "value", ft));
    try {
      w.addDocument(doc);
      fail("did not hit exception");
    } catch (IllegalArgumentException iae) {
      // Expected
      assertEquals("cannot index term vector offsets when term vectors are not indexed (field=\"field\")", iae.getMessage());
    }

    // Positions require term vectors to be enabled.
    ft = new FieldType(TextField.TYPE_NOT_STORED);
    ft.setStoreTermVectors(false);
    ft.setStoreTermVectorPositions(true);
    doc = new Document();
    doc.add(new Field("field", "value", ft));
    try {
      w.addDocument(doc);
      fail("did not hit exception");
    } catch (IllegalArgumentException iae) {
      // Expected
      assertEquals("cannot index term vector positions when term vectors are not indexed (field=\"field\")", iae.getMessage());
    }

    // Payloads require term vectors to be enabled.
    ft = new FieldType(TextField.TYPE_NOT_STORED);
    ft.setStoreTermVectors(false);
    ft.setStoreTermVectorPayloads(true);
    doc = new Document();
    doc.add(new Field("field", "value", ft));
    try {
      w.addDocument(doc);
      fail("did not hit exception");
    } catch (IllegalArgumentException iae) {
      // Expected
      assertEquals("cannot index term vector payloads when term vectors are not indexed (field=\"field\")", iae.getMessage());
    }

    // Payloads without positions, again with vectors enabled.
    ft = new FieldType(TextField.TYPE_NOT_STORED);
    ft.setStoreTermVectors(true);
    ft.setStoreTermVectorPayloads(true);
    doc = new Document();
    doc.add(new Field("field", "value", ft));
    try {
      w.addDocument(doc);
      fail("did not hit exception");
    } catch (IllegalArgumentException iae) {
      // Expected
      assertEquals("cannot index term vector payloads without term vector positions (field=\"field\")", iae.getMessage());
    }

    // None of the term vector options may be used on a stored-only (unindexed) field.
    ft = new FieldType(StoredField.TYPE);
    ft.setStoreTermVectors(true);
    doc = new Document();
    doc.add(new Field("field", "value", ft));
    try {
      w.addDocument(doc);
      fail("did not hit exception");
    } catch (IllegalArgumentException iae) {
      // Expected
      assertEquals("cannot store term vectors for a field that is not indexed (field=\"field\")", iae.getMessage());
    }

    ft = new FieldType(StoredField.TYPE);
    ft.setStoreTermVectorPositions(true);
    doc = new Document();
    doc.add(new Field("field", "value", ft));
    try {
      w.addDocument(doc);
      fail("did not hit exception");
    } catch (IllegalArgumentException iae) {
      // Expected
      assertEquals("cannot store term vector positions for a field that is not indexed (field=\"field\")", iae.getMessage());
    }

    ft = new FieldType(StoredField.TYPE);
    ft.setStoreTermVectorOffsets(true);
    doc = new Document();
    doc.add(new Field("field", "value", ft));
    try {
      w.addDocument(doc);
      fail("did not hit exception");
    } catch (IllegalArgumentException iae) {
      // Expected
      assertEquals("cannot store term vector offsets for a field that is not indexed (field=\"field\")", iae.getMessage());
    }

    ft = new FieldType(StoredField.TYPE);
    ft.setStoreTermVectorPayloads(true);
    doc = new Document();
    doc.add(new Field("field", "value", ft));
    try {
      w.addDocument(doc);
      fail("did not hit exception");
    } catch (IllegalArgumentException iae) {
      // Expected
      assertEquals("cannot store term vector payloads for a field that is not indexed (field=\"field\")", iae.getMessage());
    }

    w.close();
    dir.close();
  }
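
  // Not part of the original test: a related sketch of the FieldType invariant
  // behind the checks above. FieldType.freeze() makes the type immutable, so a
  // later attempt to flip a term vector option throws IllegalStateException.
  public void testFrozenFieldType() throws Exception {
    FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
    ft.setStoreTermVectors(true);
    ft.freeze();
    try {
      ft.setStoreTermVectorPositions(true);
      fail("did not hit exception");
    } catch (IllegalStateException ise) {
      // Expected: frozen FieldTypes reject further changes.
    }
  }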
}